# Pull the run parameters (supplied externally via the `params` list, e.g. from
# an R Markdown YAML header) into top-level variables used throughout the script.
output.var = params$output.var
transform.abs = FALSE
log.pred = params$log.pred
# Target normalization via bestNormalize is disabled for this run
norm.pred = FALSE
eda = params$eda
# Flags selecting which caret-based variable-selection algorithms to run
algo.forward.caret = params$algo.forward.caret
algo.backward.caret = params$algo.backward.caret
algo.stepwise.caret = params$algo.stepwise.caret
algo.LASSO.caret = params$algo.LASSO.caret
algo.LARS.caret = params$algo.LARS.caret
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
## $ output.var : chr "y3"
## $ log.pred : logi TRUE
## $ eda : logi FALSE
## $ algo.forward.caret : logi TRUE
## $ algo.backward.caret: logi TRUE
## $ algo.stepwise.caret: logi TRUE
## $ algo.LASSO.caret : logi TRUE
## $ algo.LARS.caret : logi TRUE
# Setup Labels
# Name of the (possibly transformed) target column: "<output.var>.log" when a
# log10 transform is requested, otherwise the raw output variable name.
# (The original `else output.var.tr = output.var` assigned inside the else
# branch AND via the outer `=` -- a redundant double assignment.)
output.var.tr = if (log.pred) paste0(output.var, '.log') else output.var
# Load the predictor matrix and labels, join them on the JobName key, and keep
# only complete cases for modeling.
feat = read.csv('../../Data/features_highprec.csv')
labels = read.csv('../../Data/labels.csv')
# All feature columns except the JobName key
predictors = names(dplyr::select(feat,-JobName))
data.ori = inner_join(feat,labels,by='JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')
# Split into complete and incomplete rows; only complete cases are kept
cc = complete.cases(data.ori)
data.notComplete = data.ori[! cc,]
data = data.ori[cc,] %>% select_at(c(predictors,output.var,'JobName'))
message('Original cases: ',nrow(data.ori))
## Original cases: 10000
message('Non-Complete cases: ',nrow(data.notComplete))
## Non-Complete cases: 3020
message('Complete cases: ',nrow(data))
## Complete cases: 6980
summary(dplyr::select_at(data,c('JobName',output.var)))
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.29
## Job_00003: 1 Median :124.03
## Job_00004: 1 Mean :125.40
## Job_00007: 1 3rd Qu.:131.06
## Job_00008: 1 Max. :193.73
## (Other) :6974
The output variable y3 shows right skewness, so we will proceed with a log transformation.
# Distribution of the raw output variable: histogram + density, then a QQ plot
df=gather(select_at(data,output.var))
ggplot(df, aes(x=value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density()
#stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
ggplot(gather(select_at(data,output.var)), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
# Apply the log10 transform to the target when requested, storing the result
# under the transformed name (output.var.tr); otherwise copy the raw target
if(log.pred==TRUE) data[[output.var.tr]] = log(data[[output.var]],10) else
data[[output.var.tr]] = data[[output.var]]
# Compare raw vs. transformed target distributions side by side
df=gather(select_at(data,c(output.var,output.var.tr)))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=2)
ggplot(gather(select_at(data,c(output.var,output.var.tr))), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
Normalization of y3 using the bestNormalize package (suggested: orderNorm). This is interesting, but I think it goes beyond the objective of the project.
# Explore candidate normalizations of the raw target with bestNormalize, which
# picks the transform with the best out-of-sample normality statistic.
t=bestNormalize::bestNormalize(data[[output.var]])
t
## Best Normalizing transformation with 6980 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - No transform: 2.9627
## - Box-Cox: 1.426
## - Log_b(x+a): 1.9884
## - sqrt(x+a): 2.4513
## - exp(x): 749.4167
## - arcsinh(x): 1.9884
## - Yeo-Johnson: 1.1169
## - orderNorm: 1.1737
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized Yeo-Johnson Transformation with 6980 nonmissing obs.:
## Estimated statistics:
## - lambda = -1.998639
## - mean (before standardization) = 0.5003083
## - sd (before standardization) = 5.108542e-06
# QQ plots: raw target vs. the bestNormalize-transformed target
qqnorm(data[[output.var]])
qqnorm(predict(t))
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentiles, which are then mapped to the same percentiles of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a normal distribution.
All predictors show a fat-tail situation, where the two tails are heavy and the distribution around the mean is low. The orderNorm transformation can help (see the [Best Normalizator] section).
Histograms
# EDA: distributions of a hand-picked subset of predictors
if (eda == TRUE){
cols = c('x11','x18','stat98','x7','stat110')
df=gather(select_at(data,cols))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=3)
# ggplot(gather(select_at(data,cols)), aes(sample=value)) +
# stat_qq()+
# facet_wrap(~key, scales = 'free',ncol=2)
# Numeric summaries of the same subset
lapply(select_at(data,cols),summary)
}
Scatter plot vs. output variable **y3.log**
# EDA: each selected predictor scattered against the transformed target
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light green',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=3)
}
All predictors show strong indications of fat tails.
# EDA: distributions of ALL predictors
if (eda == TRUE){
df=gather(select_at(data,predictors))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=4)
}
# EDA: correlation of every predictor with the transformed target, ranked
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
t=as.data.frame(round(cor(dplyr::select(data,-one_of(output.var.tr,'JobName'))
,select_at(data,output.var.tr)),4)) %>%
rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-y3.log)
# NOTE(review): `y3.log` is hardcoded here and below; this block only works
# when output.var.tr == 'y3.log' -- confirm whether other targets are ever used.
#DT::datatable(t)
message("Top Positive")
kable(head(arrange(t,desc(y3.log)),20))
message("Top Negative")
kable(head(arrange(t,y3.log),20))
}
# EDA: full correlation matrix (only a 10x10 corner is rendered)
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
t=as.data.frame(round(cor(dplyr::select(data,-one_of('JobName'))),4))
#DT::datatable(t,options=list(scrollX=T))
message("Showing only 10 variables")
kable(t[1:10,1:10])
}
Scatter plots with all predictors and the output variable (y3.log)
# EDA: every predictor scattered against the transformed target
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(predictors,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
}
No multicollinearity among predictors.
Showing the top predictors by VIF value.
# EDA: variance inflation factors, largest first (multicollinearity check)
if (eda == TRUE){
vifDF = usdm::vif(select_at(data,predictors)) %>% arrange(desc(VIF))
head(vifDF,15)
}
# Feature transformation: replace x18 by sqrt(x18) (raw x18 is dropped below)
data.tr=data %>%
mutate(x18.sqrt = sqrt(x18))
cols=c('x18','x18.sqrt')
# ggplot(gather(select_at(data.tr,cols)), aes(value)) +
# geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
# geom_density() +
# facet_wrap(~key, scales = 'free',ncol=4)
# Visual check: raw vs. sqrt-transformed x18 against the transformed target
d = gather(dplyr::select_at(data.tr,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Removing unwanted variables: raw x18, the untransformed target and the key.
# NOTE(review): 'y3' is hardcoded -- assumes output.var == 'y3'; confirm.
data.tr=data.tr %>%
dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('x18','y3','JobName')])
data=data.tr
label.names=output.var.tr
# Interaction handling for the PCA design matrix:
#   0 - no interactions
#   1 - full 2-way interactions across ALL predictors
#   2 - 2-way interactions among controlled ("x*") variables only
#   3 - 3-way interactions among controlled ("x*") variables only
InteractionMode = 3
# PCA runs on the predictors only (exclude the target column)
pca.vars = names(data)
pca.vars = pca.vars[!pca.vars %in% label.names]
# The branches below are mutually exclusive, so an if/else chain replaces the
# original independent `if` statements (which also used scalar `&` and built
# the 2-way formula only to overwrite it in mode 3).
if (InteractionMode == 0) {
  # Plain PCA on the centered & scaled predictor matrix
  pca.model = prcomp(x = data[, pca.vars], center = TRUE, scale. = TRUE, retx = TRUE)
} else if (InteractionMode == 1) {
  pca.formula = as.formula(paste0('~(', paste0(pca.vars, collapse = '+'), ')^2'))
  pca.model = prcomp(formula = pca.formula, data = data[, pca.vars]
                     , center = TRUE, scale. = TRUE, retx = TRUE)
  #saveRDS(pca.model,'pca.model.rds')
} else if (InteractionMode == 2 || InteractionMode == 3) {
  # Interactions only among the controlled variables; the "stat*" variables
  # enter the design matrix without interactions.
  controlled.vars = pca.vars[grep("^x", pca.vars)]
  stat.vars = pca.vars[grep("^stat", pca.vars)]
  degree = if (InteractionMode == 3) '3' else '2'
  interaction.form = paste0('~(', paste0(controlled.vars, collapse = '+'), ')^', degree)
  no.interact.form = paste0(stat.vars, collapse = '+')
  pca.formula = as.formula(paste(interaction.form, no.interact.form, sep = "+"))
  pca.model = prcomp(formula = pca.formula, data = data[, pca.vars]
                     , center = TRUE, scale. = TRUE, retx = TRUE)
}
# Select the leading principal components that together explain up to
# `targetCumVar` of the total variance.
targetCumVar = .8
pca.model$var = pca.model$sdev ^ 2 #eigenvalues
pca.model$pvar = pca.model$var / sum(pca.model$var) # proportion of variance per PC
pca.model$cumpvar = cumsum(pca.model$pvar ) # cumulative proportion
pca.model$pcaSel = pca.model$cumpvar<=targetCumVar # selection mask over PCs
pca.model$pcaSelCount = sum(pca.model$pcaSel)
pca.model$pcaSelTotVar = sum(pca.model$pvar[pca.model$pcaSel])
message(pca.model$pcaSelCount, " PCAs justify ",percent(targetCumVar)," of the total Variance. (",percent(pca.model$pcaSelTotVar),")")
## 21 PCAs justify 80.0% of the total Variance. (79.8%)
# Scree / variance-explained plots
plot(pca.model$var,xlab="Principal component", ylab="Proportion of variance explained", type='b')
plot(cumsum(pca.model$pvar ),xlab="Principal component", ylab="Cumulative Proportion of variance explained", ylim=c(0,1), type='b')
screeplot(pca.model,npcs = pca.model$pcaSelCount)
screeplot(pca.model,npcs = pca.model$pcaSelCount,type='lines')
#summary(pca.model)
#pca.model$rotation
#creating dataset
# Model dataset: the target column plus the selected principal-component scores
data.pca = dplyr::select(data,!!label.names) %>%
dplyr::bind_cols(dplyr::select(as.data.frame(pca.model$x)
,!!colnames(pca.model$rotation)[pca.model$pcaSel])
)
data.pca = data.pca[sample(nrow(data.pca)),] # randomly shuffle data
# 80/20 train/test split (caTools::sample.split, balanced on the target)
split = sample.split(data.pca[,label.names], SplitRatio = 0.8)
data.train = subset(data.pca, split == TRUE)
data.test = subset(data.pca, split == FALSE)
# Regression diagnostics for a fitted lm: base diagnostic plots, studentized /
# standardized residual plots, a residual histogram against N(0,1), leverage
# and Cook's distance plots.
#
# model - fitted lm object
# train - the data the model was fitted on (used for predictions/thresholds)
#
# Returns the vector of Cook's distances (named by row).
plot.diagnostics <- function(model, train) {
# Base R diagnostic panel (residuals vs fitted, QQ, scale-location, leverage)
plot(model)
residuals = resid(model) # Plotted above in plot(lm.out)
r.standard = rstandard(model)
r.student = rstudent(model)
# Studentized residuals vs. predicted values
df = data.frame(x=predict(model,train),y=r.student)
p=ggplot(data=df,aes(x=x,y=y)) +
geom_point(color='blue',alpha=0.5,shape=20,size=2) +
geom_hline(yintercept = 0,size=1)+
ylab("Student Residuals") +
xlab("Predicted Values")+
ggtitle("Student Residual Plot")
plot(p)
# Standardized residuals vs. predicted values, with +/-2 reference lines.
# NOTE(review): this panel plots STANDARDIZED residuals but reuses the
# "Student" axis label and title -- likely a copy-paste slip.
df = data.frame(x=predict(model,train),y=r.standard)
p=ggplot(data=df,aes(x=x,y=y)) +
geom_point(color='blue',alpha=0.5,shape=20,size=2) +
geom_hline(yintercept = c(-2,0,2),size=1)+
ylab("Student Residuals") +
xlab("Predicted Values")+
ggtitle("Student Residual Plot")
plot(p)
# Histogram of studentized residuals overlaid with a standard normal curve
df=data.frame(r.student)
p=ggplot(data=df,aes(r.student)) +
geom_histogram(aes(y=..density..),bins = 50,fill='blue',alpha=0.6) +
stat_function(fun = dnorm, n = 100, args = list(mean = 0, sd = 1)) +
ylab("Density")+
xlab("Studentized Residuals")+
ggtitle("Distribution of Studentized Residuals")
plot(p)
# http://www.stat.columbia.edu/~martin/W2024/R7.pdf
# Influential plots
inf.meas = influence.measures(model)
# print (summary(inf.meas)) # too much data
# Leverage plot (hat values per observation index)
lev = hat(model.matrix(model))
df=tibble::rownames_to_column(as.data.frame(lev),'id')
p=ggplot(data=df,aes(x=as.numeric(id),y=lev)) +
geom_point(color='blue',alpha=0.5,shape=20,size=2) +
ylab('Leverage - check') +
xlab('Index')
plot(p)
# Cook's Distance per observation; only the most extreme points (cd > 15/n)
# are labeled, while the reference line is drawn at the usual 4/n cutoff
cd = cooks.distance(model)
df=tibble::rownames_to_column(as.data.frame(cd),'id')
p=ggplot(data=df,aes(x=as.numeric(id),y=cd)) +
geom_point(color='blue',alpha=0.5,shape=20,size=2) +
geom_text(data=filter(df,cd>15/nrow(train)),aes(label=id),check_overlap=T,size=3,vjust=-.5)+
ylab('Cooks distances') +
geom_hline(yintercept = c(4/nrow(train),0),size=1)+
xlab('Index')
plot(p)
print (paste("Number of data points that have Cook's D > 4/n: ", length(cd[cd > 4/nrow(train)]), sep = ""))
print (paste("Number of data points that have Cook's D > 1: ", length(cd[cd > 1]), sep = ""))
return(cd)
}
# Build the list of RNG seeds that caret::trainControl expects, so that
# resampling is reproducible across runs.
# Based on http://jaehyeon-kim.github.io/2015/05/Setup-Random-Seeds-on-Caret-Package.html
#
# method  - resampling method: "cv" or "repeatedcv" (anything else -> NULL)
# numbers - number of folds
# repeats - number of repeats (used only for "repeatedcv")
# tunes   - tuning-grid length, if any (extends each per-resample seed vector)
# seed    - master seed used to generate the seed list
#
# Returns a list of length B + 1: B integer seed vectors of length
# `numbers + tunes` (one per resample) plus a final single seed for the last
# model fit, or NULL when the resampling method is not recognized.
setCaretSeeds <- function(method = "cv", numbers = 1, repeats = 1, tunes = NULL, seed = 1701) {
  # B is the number of resamples
  B <- if (method == "cv") numbers
       else if (method == "repeatedcv") numbers * repeats
       else NULL
  # BUG FIX: the original tested is.null(length) -- `length` is a base function
  # and never NULL, so unsupported methods fell through to vector(length = NULL)
  # and crashed instead of returning NULL.
  if (is.null(B)) {
    seeds <- NULL
  } else {
    set.seed(seed = seed)
    seeds <- vector(mode = "list", length = B)
    seeds <- lapply(seeds, function(x) sample.int(n = 1000000
                    , size = numbers + ifelse(is.null(tunes), 0, tunes)))
    # One extra seed for the final model fit on the full training set
    seeds[[length(seeds) + 1]] <- sample.int(n = 1000000, size = 1)
  }
  # return seeds
  seeds
}
# Train a variable-selection regression with caret and produce standard
# reporting (CV metric plots, residual plots, selected coefficients).
# Supported methods: leapForward / leapBackward / leapSeq (subset selection via
# leaps), glmnet with subopt == 'LASSO', and lars.
#
# formula        - full model formula
# data           - training data
# method         - caret method name (see above)
# subopt         - sub-option qualifier (e.g. 'LASSO' for glmnet)
# feature.names  - predictor names (sizes the nvmax grid for leap* methods)
# train.control  - optional trainControl; defaults to seeded 10-fold CV
# tune.grid      - optional tuning grid; a method-specific default is built
# pre.proc       - optional preProcess spec (the lars branch forces center/scale)
#
# Returns a list containing the fitted model, the tuning id (leap* only), and
# the residual/metric ggplot objects; the exact contents vary per method.
train.caret.glmselect = function(formula, data, method
,subopt = NULL, feature.names
, train.control = NULL, tune.grid = NULL, pre.proc = NULL){
# Default resampling setup: reproducible 10-fold CV with grid search
if(is.null(train.control)){
train.control <- trainControl(method = "cv"
,number = 10
,seeds = setCaretSeeds(method = "cv"
, numbers = 10
, seed = 1701)
,search = "grid"
,verboseIter = TRUE
,allowParallel = TRUE
)
}
# Method-specific default tuning grids
if(is.null(tune.grid)){
if (method == 'leapForward' | method == 'leapBackward' | method == 'leapSeq'){
tune.grid = data.frame(nvmax = 1:length(feature.names))
}
if (method == 'glmnet' && subopt == 'LASSO'){
# Will only show 1 Lambda value during training, but that is OK
# https://stackoverflow.com/questions/47526544/why-need-to-tune-lambda-with-carettrain-method-glmnet-and-cv-glmnet
# Another option for LASSO is this: https://github.com/topepo/caret/blob/master/RegressionTests/Code/lasso.R
lambda = 10^seq(-2,0, length =100)
alpha = c(1)  # alpha = 1 is pure LASSO in glmnet
tune.grid = expand.grid(alpha = alpha,lambda = lambda)
}
if (method == 'lars'){
# https://github.com/topepo/caret/blob/master/RegressionTests/Code/lars.R
fraction = seq(0, 1, length = 100)
tune.grid = expand.grid(fraction = fraction)
pre.proc = c("center", "scale")
}
}
# http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
# #cl <- makeCluster(ceiling(detectCores()*0.5)) # use 75% of cores only, leave rest for other tasks
# NOTE(review): detectCores()*0.75 is usually fractional; makeCluster truncates
# it, but ceiling()/floor() would make the intent explicit.
cl <- makeCluster(detectCores()*0.75) # use 75% of cores only, leave rest for other tasks
registerDoParallel(cl)
set.seed(1)
# note that the seed has to actually be set just before this function is called
# setting it above alone did not ensure reproducibility for some reason
model.caret <- caret::train(formula
, data = data
, method = method
, tuneGrid = tune.grid
, trControl = train.control
, preProc = pre.proc
)
stopCluster(cl)
registerDoSEQ() # register sequential engine in case you are not using this function anymore
# ---- Reporting: leap* (regsubsets) methods ----
if (method == 'leapForward' | method == 'leapBackward' | method == 'leapSeq'){
print("All models results")
print(model.caret$results) # all model results
print("Best Model")
print(model.caret$bestTune) # best model
model = model.caret$finalModel
# Metrics Plot: CV metrics as a function of subset size (nvmax)
dataPlot = model.caret$results %>%
gather(key='metric',value='value',-nvmax) %>%
dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
metricsPlot = ggplot(data=dataPlot,aes(x=nvmax,y=value) ) +
geom_line(color='lightblue4') +
geom_point(color='blue',alpha=0.7,size=.9) +
facet_wrap(~metric,ncol=2,scales='free_y')+
theme_light()
plot(metricsPlot)
# Residuals Plot
# leap function does not support studentized residuals
dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
geom_point(color='light blue',alpha=0.7) +
geom_smooth(method="lm")+
theme_light()
plot(residPlot)
residHistogram = ggplot(dataPlot,aes(x=res)) +
geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
#geom_density(color='lightblue4') +
stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
, sd = sd(dataPlot$res)),color='lightblue4')
# NOTE(review): missing '+' before theme_light() -- the theme is evaluated
# standalone and has no effect on the histogram above.
theme_light()
plot(residHistogram)
id = rownames(model.caret$bestTune)
# Provides the coefficients of the best model
# regsubsets does not return a full model (see the regsubsets documentation),
# so we need to recalculate the model to get coefficient intervals
# https://stackoverflow.com/questions/13063762/how-to-obtain-a-lm-object-from-regsubsets
print("Coefficients of final model:")
coefs <- coef(model, id=id)
# Refit an lm on the selected variables to obtain confidence intervals
nams <- names(coefs)
nams <- nams[!nams %in% "(Intercept)"]
response <- as.character(formula[[2]])
form <- as.formula(paste(response, paste(nams, collapse = " + "), sep = " ~ "))
mod <- lm(form, data = data)
#coefs
#coef(mod)
print(car::Confint(mod))
return(list(model = model,id = id, residPlot = residPlot, residHistogram=residHistogram
,modelLM=mod))
}
# ---- Reporting: glmnet LASSO ----
if (method == 'glmnet' && subopt == 'LASSO'){
print(model.caret)
print(plot(model.caret))
print(model.caret$bestTune)
print(model.caret$results)
model=model.caret$finalModel
# Metrics Plot: CV metrics as a function of lambda
dataPlot = model.caret$results %>%
gather(key='metric',value='value',-lambda) %>%
dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
metricsPlot = ggplot(data=dataPlot,aes(x=lambda,y=value) ) +
geom_line(color='lightblue4') +
geom_point(color='blue',alpha=0.7,size=.9) +
facet_wrap(~metric,ncol=2,scales='free_y')+
theme_light()
plot(metricsPlot)
# Residuals Plot
dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
geom_point(color='light blue',alpha=0.7) +
geom_smooth(method="lm")+
theme_light()
plot(residPlot)
residHistogram = ggplot(dataPlot,aes(x=res)) +
geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
#geom_density(color='lightblue4') +
stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
, sd = sd(dataPlot$res)),color='lightblue4')
# NOTE(review): missing '+' before theme_light() -- evaluated standalone, no effect.
theme_light()
plot(residHistogram)
print("Coefficients")
#no interval for glmnet: https://stackoverflow.com/questions/39750965/confidence-intervals-for-ridge-regression
# Keep only the non-zero coefficients at the tuned lambda
t=coef(model,s=model.caret$bestTune$lambda)
model.coef = t[which(t[,1]!=0),]
print(as.data.frame(model.coef))
id = NULL # not really needed but added for consistency
return(list(model = model.caret,id = id, residPlot = residPlot, metricsPlot=metricsPlot ))
}
# ---- Reporting: lars ----
if (method == 'lars'){
print(model.caret)
print(plot(model.caret))
print(model.caret$bestTune)
# Metrics Plot: CV metrics as a function of the L1 fraction
dataPlot = model.caret$results %>%
gather(key='metric',value='value',-fraction) %>%
dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
metricsPlot = ggplot(data=dataPlot,aes(x=fraction,y=value) ) +
geom_line(color='lightblue4') +
geom_point(color='blue',alpha=0.7,size=.9) +
facet_wrap(~metric,ncol=2,scales='free_y')+
theme_light()
plot(metricsPlot)
# Residuals Plot
dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
geom_point(color='light blue',alpha=0.7) +
geom_smooth(method="lm")+
theme_light()
plot(residPlot)
residHistogram = ggplot(dataPlot,aes(x=res)) +
geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
#geom_density(color='lightblue4') +
stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
, sd = sd(dataPlot$res)),color='lightblue4')
# NOTE(review): missing '+' before theme_light() -- evaluated standalone, no effect.
theme_light()
plot(residHistogram)
print("Coefficients")
# Keep only the non-zero coefficients at the tuned fraction
t=coef(model.caret$finalModel,s=model.caret$bestTune$fraction,mode='fraction')
model.coef = t[which(t!=0)]
print(model.coef)
id = NULL # not really needed but added for consistency
return(list(model = model.caret,id = id, residPlot = residPlot, residHistogram=residHistogram))
}
}
# Prediction helper for leaps::regsubsets fits, which ship no predict() method.
# Adapted from https://stackoverflow.com/questions/48265743/linear-model-subset-selection-goodness-of-fit-with-k-fold-cross-validation
# The formula is passed in explicitly because object$call[[2]] does not
# reliably hold the formula used to fit the object.
predict.regsubsets <- function(object, newdata, id, formula, ...) {
  # Expand newdata into a full design matrix (intercept + interaction terms)
  design <- model.matrix(formula, newdata)
  # Coefficients of the size-`id` model selected by regsubsets
  betas <- coef(object, id = id)
  # Use only the design columns that the selected model actually includes
  return(design[, names(betas)] %*% betas)
}
# Evaluate a fitted model on a held-out test set: prints a summary of the
# predictions and the test MSE, then returns a ggplot of actual vs. predicted
# values (back-transformed to the original scale when the target was log10- or
# normalization-transformed).
#
# model          - fitted model (lm, regsubsets final model, glmnet, lars)
# test           - test data.frame
# level          - confidence level for lm predictions (method = NULL only)
# draw.limits, good, ok - tolerance bands on the actual-vs-predicted plot
#                  (NOTE(review): draw.limits is currently never consulted)
# method/subopt  - caret method used to fit `model` (NULL = plain lm)
# id             - selected model size, required for the leap* methods
# formula        - full model formula (leap* methods use a subset of it)
# feature.names/label.names - predictor / target column names
# transformation - bestNormalize transformation (used when norm.pred == TRUE)
#
# NOTE(review): relies on the globals `log.pred` and `norm.pred` for the
# back-transformation -- confirm they are set before calling.
test.model = function(model, test, level=0.95
                      ,draw.limits = FALSE, good = 0.1, ok = 0.15
                      ,method = NULL, subopt = NULL
                      ,id = NULL, formula, feature.names, label.names
                      ,transformation = NULL){
  ## if using caret for glm select equivalent functionality,
  ## need to pass formula (full is ok as it will select subset of variables from there)
  # BUG FIX: the original chained independent `if (method == ...)` tests after
  # the is.null() check; with method = NULL those comparisons return logical(0)
  # and `if` errors ("argument is of length zero"). Likewise subopt may be NULL,
  # so identical() guards the glmnet branch.
  if (is.null(method)){
    pred = predict(model, newdata=test, interval="confidence", level = level)
  } else if (method %in% c('leapForward','leapBackward','leapSeq')){
    pred = predict.regsubsets(model, newdata = test, id = id, formula = formula)
  } else if (method == 'glmnet' && identical(subopt, 'LASSO')){
    xtest = as.matrix(test[,feature.names])
    pred=as.data.frame(predict(model, xtest))
  } else if (method == 'lars'){
    pred=as.data.frame(predict(model, newdata = test))
  }
  # Summary of predicted values
  print ("Summary of predicted values: ")
  print(summary(pred[,1]))
  test.mse = mean((test[,label.names]-pred[,1])^2)
  print (paste(method, subopt, "Test MSE:", test.mse, sep=" "))
  if(log.pred == TRUE || norm.pred == TRUE){
    # Comparison on the transformed scale first.
    # BUG FIX: the ggplot object was built but never printed (ggplots are only
    # auto-printed at top level), so this plot silently never appeared.
    df=data.frame(x=test[,label.names],y=pred[,1])
    print(
      ggplot(df,aes(x=x,y=y)) +
        geom_point(color='blue',alpha=0.5,shape=20,size=2) +
        geom_abline(slope=1,intercept=0,color='black',size=1) +
        #scale_y_continuous(limits=c(min(df),max(df)))+
        xlab("Actual (Transformed)")+
        ylab("Predicted (Transformed)")
    )
  }
  # Back-transform actual/predicted to the original scale for the final plot
  if (log.pred == FALSE && norm.pred == FALSE){
    x = test[,label.names]
    y = pred[,1]
  }
  if (log.pred == TRUE){
    x = 10^test[,label.names]
    y = 10^pred[,1]
  }
  if (norm.pred == TRUE){
    x = predict(transformation, test[,label.names], inverse = TRUE)
    y = predict(transformation, pred[,1], inverse = TRUE)
  }
  df=data.frame(x,y)
  # Good/ok tolerance bands around the perfect-prediction diagonal
  ggplot(df,aes(x,y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_abline(slope=c(1+good,1-good,1+ok,1-ok)
                ,intercept=rep(0,4),color=c('dark green','dark green','dark red','dark red'),size=1,alpha=0.8) +
    #scale_y_continuous(limits=c(min(df),max(df)))+
    xlab("Actual")+
    ylab("Predicted")
}
# Build the model formula (target ~ all selected PCs) and a grand-mean
# (intercept-only) baseline formula from the training column names.
n <- names(data.train)
formula <- as.formula(paste(paste(n[n %in% label.names], collapse = " + ")
," ~", paste(n[!n %in% label.names], collapse = " + ")))
grand.mean.formula = as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~ 1"))
print(formula)
## y3.log ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 +
## PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 +
## PC19 + PC20 + PC21
print(grand.mean.formula)
## y3.log ~ 1
# Update feature.names because we may have transformed some features
feature.names = n[!n %in% label.names]
# Full OLS model on all selected principal components
model.full = lm(formula , data.train)
summary(model.full)
##
## Call:
## lm(formula = formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.10099 -0.02376 -0.00455 0.01948 0.19126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.097e+00 4.628e-04 4530.392 < 2e-16 ***
## PC1 2.874e-04 3.102e-05 9.263 < 2e-16 ***
## PC2 -3.749e-04 3.658e-05 -10.248 < 2e-16 ***
## PC3 -8.989e-05 4.378e-05 -2.053 0.040110 *
## PC4 -9.457e-05 4.510e-05 -2.097 0.036050 *
## PC5 -2.370e-04 4.525e-05 -5.238 1.68e-07 ***
## PC6 9.804e-05 4.622e-05 2.121 0.033965 *
## PC7 -1.747e-04 4.689e-05 -3.726 0.000197 ***
## PC8 -1.276e-04 4.812e-05 -2.651 0.008041 **
## PC9 8.338e-05 4.995e-05 1.669 0.095101 .
## PC10 -2.576e-05 5.149e-05 -0.500 0.616920
## PC11 4.798e-04 5.335e-05 8.994 < 2e-16 ***
## PC12 -3.712e-04 5.366e-05 -6.917 5.14e-12 ***
## PC13 2.935e-04 5.537e-05 5.300 1.20e-07 ***
## PC14 5.422e-04 5.606e-05 9.673 < 2e-16 ***
## PC15 -1.241e-04 5.772e-05 -2.150 0.031607 *
## PC16 2.137e-04 6.012e-05 3.554 0.000382 ***
## PC17 -9.632e-05 6.251e-05 -1.541 0.123387
## PC18 -8.540e-05 6.477e-05 -1.319 0.187375
## PC19 -8.936e-05 6.766e-05 -1.321 0.186694
## PC20 4.025e-04 7.248e-05 5.553 2.95e-08 ***
## PC21 -3.429e-04 7.567e-05 -4.531 5.98e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03457 on 5562 degrees of freedom
## Multiple R-squared: 0.0935, Adjusted R-squared: 0.09008
## F-statistic: 27.32 on 21 and 5562 DF, p-value: < 2.2e-16
# Diagnostics for the full model; flag high-influence rows (Cook's D > 4/n)
cd.full = plot.diagnostics(model=model.full, train=data.train)
## [1] "Number of data points that have Cook's D > 4/n: 270"
## [1] "Number of data points that have Cook's D > 1: 0"
high.cd = names(cd.full[cd.full > 4/nrow(data.train)])
#save dataset with high.cd flagged
# NOTE(review): this reuses `t`, which earlier held the bestNormalize
# transformation; later test.model(..., transformation = t) calls will receive
# this data frame instead -- harmless only while norm.pred is FALSE.
t = data.train %>%
rownames_to_column() %>%
mutate(high.cd = ifelse(rowname %in% high.cd,1,0))
#write.csv(t,file='data_high_cd_flag.csv',row.names = F)
###
# Refit the full model excluding the high-Cook's-D rows
data.train2 = data.train[!(rownames(data.train)) %in% high.cd,]
model.full2 = lm(formula , data.train2)
summary(model.full2)
##
## Call:
## lm(formula = formula, data = data.train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.076433 -0.020875 -0.002849 0.019196 0.085203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.094e+00 3.952e-04 5298.868 < 2e-16 ***
## PC1 2.697e-04 2.660e-05 10.136 < 2e-16 ***
## PC2 -3.831e-04 3.141e-05 -12.199 < 2e-16 ***
## PC3 -1.364e-04 3.758e-05 -3.630 0.000286 ***
## PC4 -6.954e-05 3.854e-05 -1.804 0.071215 .
## PC5 -2.346e-04 3.883e-05 -6.040 1.64e-09 ***
## PC6 8.715e-05 3.944e-05 2.210 0.027177 *
## PC7 -1.779e-04 4.021e-05 -4.424 9.90e-06 ***
## PC8 -1.058e-04 4.136e-05 -2.558 0.010567 *
## PC9 1.096e-04 4.285e-05 2.558 0.010558 *
## PC10 2.573e-05 4.402e-05 0.585 0.558867
## PC11 4.992e-04 4.558e-05 10.951 < 2e-16 ***
## PC12 -3.876e-04 4.597e-05 -8.430 < 2e-16 ***
## PC13 2.583e-04 4.748e-05 5.441 5.54e-08 ***
## PC14 6.212e-04 4.819e-05 12.892 < 2e-16 ***
## PC15 -1.272e-04 4.948e-05 -2.571 0.010175 *
## PC16 1.897e-04 5.160e-05 3.676 0.000239 ***
## PC17 -6.545e-05 5.338e-05 -1.226 0.220173
## PC18 -7.167e-05 5.569e-05 -1.287 0.198158
## PC19 -1.125e-04 5.800e-05 -1.939 0.052566 .
## PC20 4.207e-04 6.204e-05 6.781 1.32e-11 ***
## PC21 -2.845e-04 6.453e-05 -4.409 1.06e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02879 on 5292 degrees of freedom
## Multiple R-squared: 0.1335, Adjusted R-squared: 0.1301
## F-statistic: 38.83 on 21 and 5292 DF, p-value: < 2.2e-16
# Diagnostics for the refit (high-influence rows removed)
cd.full2 = plot.diagnostics(model.full2, data.train2)
## [1] "Number of data points that have Cook's D > 4/n: 202"
## [1] "Number of data points that have Cook's D > 1: 0"
# much more normal residuals than before.
# Checking to see if distributions are different and if so which variables
# High Leverage Plot: target distribution for high-Cook's-D vs. normal rows
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,target=one_of(label.names))
ggplot(data=plotData, aes(x=type,y=target)) +
geom_boxplot(fill='light blue',outlier.shape=NA) +
scale_y_continuous(name="Target Variable Values",label=scales::comma_format(accuracy=.1)) +
theme_light() +
ggtitle('Distribution of High Leverage Points and Normal Points')
# 2 sample t-tests: compare each predictor between High and Normal groups
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,one_of(feature.names))
comp.test = lapply(dplyr::select(plotData, one_of(feature.names))
, function(x) t.test(x ~ plotData$type, var.equal = TRUE))
# Keep only the predictors whose t-test is significant at the 5% level
sig.comp = list.filter(comp.test, p.value < 0.05)
sapply(sig.comp, function(x) x[['p.value']])
## PC1 PC14
## 1.534155e-06 1.032959e-04
# Box plots of the significantly different predictors (High vs. Normal rows)
mm = melt(plotData, id=c('type')) %>% filter(variable %in% names(sig.comp))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=5, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Distribution (box) Plots across ALL predictors
mm = melt(plotData, id=c('type'))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=8, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Intercept-only (grand mean) baseline model for comparison
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
##
## Call:
## lm(formula = grand.mean.formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.114676 -0.023705 -0.003387 0.020847 0.190636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.096552 0.000485 4323 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03624 on 5583 degrees of freedom
Basic: http://www.stat.columbia.edu/~martin/W2024/R10.pdf Cross Validation + Other Metrics: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
# Forward stepwise selection (leaps via caret), if enabled by the run params
if (algo.forward.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
, data = data.train
, method = "leapForward"
, feature.names = feature.names)
model.forward = returned$model
id = returned$id
}
## Aggregating results
## Selecting tuning parameters
## Fitting nvmax = 21 on full training set
## [1] "All models results"
## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0.03596827 0.01527512 0.02784873 0.0008825615 0.006983967 0.0004973760
## 2 2 0.03575392 0.02743041 0.02768393 0.0008225004 0.012005307 0.0003996057
## 3 3 0.03546814 0.04314125 0.02751325 0.0007761531 0.016301304 0.0003662692
## 4 4 0.03518759 0.05942006 0.02725989 0.0008222871 0.026557851 0.0003374721
## 5 5 0.03504569 0.06691536 0.02714897 0.0008683118 0.026560799 0.0003921311
## 6 6 0.03502016 0.06788875 0.02713994 0.0008683515 0.024237027 0.0003823853
## 7 7 0.03494874 0.07154979 0.02709469 0.0008759187 0.023547787 0.0003737225
## 8 8 0.03485154 0.07691164 0.02701881 0.0008845557 0.025484354 0.0004071258
## 9 9 0.03474539 0.08175798 0.02694702 0.0008798285 0.022240966 0.0004188303
## 10 10 0.03473831 0.08205722 0.02691589 0.0008735658 0.021275990 0.0004221505
## 11 11 0.03467281 0.08545058 0.02687814 0.0008590112 0.021741633 0.0004367692
## 12 12 0.03471105 0.08358349 0.02689703 0.0008296956 0.021825526 0.0004403546
## 13 13 0.03472471 0.08292914 0.02690164 0.0008622382 0.021544881 0.0004531136
## 14 14 0.03471170 0.08360378 0.02689338 0.0008578651 0.021254148 0.0004399639
## 15 15 0.03470650 0.08400074 0.02689739 0.0008440337 0.022149611 0.0004283112
## 16 16 0.03470284 0.08411915 0.02689260 0.0008127207 0.022074747 0.0004163964
## 17 17 0.03468172 0.08517663 0.02688064 0.0008280411 0.021729074 0.0004294478
## 18 18 0.03467684 0.08540811 0.02688313 0.0008302229 0.021636059 0.0004150966
## 19 19 0.03465897 0.08637353 0.02686931 0.0008419497 0.022167629 0.0004303404
## 20 20 0.03463741 0.08749013 0.02685325 0.0008433300 0.022610295 0.0004336634
## 21 21 0.03463457 0.08762411 0.02685159 0.0008402589 0.022592445 0.0004321247
## [1] "Best Model"
## nvmax
## 21 21
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Coefficients of final model:"
## Estimate 2.5 % 97.5 %
## (Intercept) 2.096700e+00 2.095793e+00 2.097608e+00
## PC1 2.873471e-04 2.265370e-04 3.481572e-04
## PC2 -3.749128e-04 -4.466321e-04 -3.031935e-04
## PC3 -8.988881e-05 -1.757190e-04 -4.058607e-06
## PC4 -9.456845e-05 -1.829814e-04 -6.155477e-06
## PC5 -2.370038e-04 -3.257068e-04 -1.483009e-04
## PC6 9.804339e-05 7.424745e-06 1.886620e-04
## PC7 -1.746993e-04 -2.666232e-04 -8.277543e-05
## PC8 -1.275751e-04 -2.219056e-04 -3.324452e-05
## PC9 8.338440e-05 -1.453646e-05 1.813053e-04
## PC10 -2.575589e-05 -1.266890e-04 7.517724e-05
## PC11 4.798348e-04 3.752480e-04 5.844216e-04
## PC12 -3.711748e-04 -4.763764e-04 -2.659731e-04
## PC13 2.934464e-04 1.849049e-04 4.019878e-04
## PC14 5.422023e-04 4.323119e-04 6.520927e-04
## PC15 -1.240986e-04 -2.372591e-04 -1.093813e-05
## PC16 2.136898e-04 9.582792e-05 3.315516e-04
## PC17 -9.631725e-05 -2.188520e-04 2.621754e-05
## PC18 -8.540462e-05 -2.123825e-04 4.157327e-05
## PC19 -8.935724e-05 -2.220067e-04 4.329220e-05
## PC20 4.024553e-04 2.603635e-04 5.445472e-04
## PC21 -3.428740e-04 -4.912106e-04 -1.945375e-04
# Evaluate the forward-selection model on the held-out test set.
# NOTE(review): `t` was overwritten earlier with a flagged data frame, so the
# `transformation = t` argument is no longer the bestNormalize transformation
# (only consulted when norm.pred == TRUE, which is FALSE in this run).
if (algo.forward.caret == TRUE){
test.model(model=model.forward, test=data.test
,method = 'leapForward',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
## [1] "Summary of predicted values: "
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.066 2.089 2.097 2.097 2.105 2.137
## [1] "leapForward Test MSE: 0.00114591004507159"
# Backward stepwise selection (leaps via caret), if enabled by the run params
if (algo.backward.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "leapBackward"
,feature.names = feature.names)
model.backward = returned$model
id = returned$id
}
## Aggregating results
## Selecting tuning parameters
## Fitting nvmax = 21 on full training set
## [1] "All models results"
## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0.03596827 0.01527512 0.02784873 0.0008825615 0.006983967 0.0004973760
## 2 2 0.03575392 0.02743041 0.02768393 0.0008225004 0.012005307 0.0003996057
## 3 3 0.03546814 0.04314125 0.02751325 0.0007761531 0.016301304 0.0003662692
## 4 4 0.03518759 0.05942006 0.02725989 0.0008222871 0.026557851 0.0003374721
## 5 5 0.03504569 0.06691536 0.02714897 0.0008683118 0.026560799 0.0003921311
## 6 6 0.03502016 0.06788875 0.02713994 0.0008683515 0.024237027 0.0003823853
## 7 7 0.03494874 0.07154979 0.02709469 0.0008759187 0.023547787 0.0003737225
## 8 8 0.03485154 0.07691164 0.02701881 0.0008845557 0.025484354 0.0004071258
## 9 9 0.03474539 0.08175798 0.02694702 0.0008798285 0.022240966 0.0004188303
## 10 10 0.03473831 0.08205722 0.02691589 0.0008735658 0.021275990 0.0004221505
## 11 11 0.03467281 0.08545058 0.02687814 0.0008590112 0.021741633 0.0004367692
## 12 12 0.03471105 0.08358349 0.02689703 0.0008296956 0.021825526 0.0004403546
## 13 13 0.03471655 0.08329772 0.02689755 0.0008615133 0.021391394 0.0004512433
## 14 14 0.03470989 0.08369140 0.02689052 0.0008578053 0.021226466 0.0004390919
## 15 15 0.03470650 0.08400074 0.02689739 0.0008440337 0.022149611 0.0004283112
## 16 16 0.03470284 0.08411915 0.02689260 0.0008127207 0.022074747 0.0004163964
## 17 17 0.03468172 0.08517663 0.02688064 0.0008280411 0.021729074 0.0004294478
## 18 18 0.03467684 0.08540811 0.02688313 0.0008302229 0.021636059 0.0004150966
## 19 19 0.03465897 0.08637353 0.02686931 0.0008419497 0.022167629 0.0004303404
## 20 20 0.03463741 0.08749013 0.02685325 0.0008433300 0.022610295 0.0004336634
## 21 21 0.03463457 0.08762411 0.02685159 0.0008402589 0.022592445 0.0004321247
## [1] "Best Model"
## nvmax
## 21 21
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Coefficients of final model:"
## Estimate 2.5 % 97.5 %
## (Intercept) 2.096700e+00 2.095793e+00 2.097608e+00
## PC1 2.873471e-04 2.265370e-04 3.481572e-04
## PC2 -3.749128e-04 -4.466321e-04 -3.031935e-04
## PC3 -8.988881e-05 -1.757190e-04 -4.058607e-06
## PC4 -9.456845e-05 -1.829814e-04 -6.155477e-06
## PC5 -2.370038e-04 -3.257068e-04 -1.483009e-04
## PC6 9.804339e-05 7.424745e-06 1.886620e-04
## PC7 -1.746993e-04 -2.666232e-04 -8.277543e-05
## PC8 -1.275751e-04 -2.219056e-04 -3.324452e-05
## PC9 8.338440e-05 -1.453646e-05 1.813053e-04
## PC10 -2.575589e-05 -1.266890e-04 7.517724e-05
## PC11 4.798348e-04 3.752480e-04 5.844216e-04
## PC12 -3.711748e-04 -4.763764e-04 -2.659731e-04
## PC13 2.934464e-04 1.849049e-04 4.019878e-04
## PC14 5.422023e-04 4.323119e-04 6.520927e-04
## PC15 -1.240986e-04 -2.372591e-04 -1.093813e-05
## PC16 2.136898e-04 9.582792e-05 3.315516e-04
## PC17 -9.631725e-05 -2.188520e-04 2.621754e-05
## PC18 -8.540462e-05 -2.123825e-04 4.157327e-05
## PC19 -8.935724e-05 -2.220067e-04 4.329220e-05
## PC20 4.024553e-04 2.603635e-04 5.445472e-04
## PC21 -3.428740e-04 -4.912106e-04 -1.945375e-04
# Score the backward-selection model against the test split.
# NOTE(review): `transformation = t` is assumed to be the back-transformation
# for the logged response — confirm against test.model()'s definition.
if (algo.backward.caret == TRUE){
  test.model(model = model.backward, test = data.test,
             method = 'leapBackward',
             subopt = NULL,
             formula = formula,
             feature.names = feature.names,
             label.names = label.names,
             id = id,
             draw.limits = TRUE,
             transformation = t)
}
## [1] "Summary of predicted values: "
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.066 2.089 2.097 2.097 2.105 2.137
## [1] "leapBackward Test MSE: 0.00114591004507159"
# Fit a stepwise-selection model (leapSeq) via caret on the training split,
# keeping the fitted model and its run id for the evaluation step below.
if (algo.stepwise.caret == TRUE){
  set.seed(1)  # reproducible CV folds
  returned = train.caret.glmselect(
    formula = formula,
    data = data.train,
    method = "leapSeq",
    feature.names = feature.names
  )
  model.stepwise = returned$model
  id = returned$id
}
## Aggregating results
## Selecting tuning parameters
## Fitting nvmax = 21 on full training set
## [1] "All models results"
## nvmax RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0.03596827 0.01527512 0.02784873 0.0008825615 0.006983967 0.0004973760
## 2 2 0.03575392 0.02743041 0.02768393 0.0008225004 0.012005307 0.0003996057
## 3 3 0.03559100 0.03645637 0.02760432 0.0007565589 0.015046580 0.0003491449
## 4 4 0.03521623 0.05820690 0.02725501 0.0008120500 0.026133041 0.0003377191
## 5 5 0.03504569 0.06691536 0.02714897 0.0008683118 0.026560799 0.0003921311
## 6 6 0.03502016 0.06788875 0.02713994 0.0008683515 0.024237027 0.0003823853
## 7 7 0.03494874 0.07154979 0.02709469 0.0008759187 0.023547787 0.0003737225
## 8 8 0.03485154 0.07691164 0.02701881 0.0008845557 0.025484354 0.0004071258
## 9 9 0.03474539 0.08175798 0.02694702 0.0008798285 0.022240966 0.0004188303
## 10 10 0.03473831 0.08205722 0.02691589 0.0008735658 0.021275990 0.0004221505
## 11 11 0.03467281 0.08545058 0.02687814 0.0008590112 0.021741633 0.0004367692
## 12 12 0.03480051 0.07838011 0.02698333 0.0007736157 0.022396497 0.0003830715
## 13 13 0.03479003 0.07967316 0.02698113 0.0010144558 0.025800714 0.0005416168
## 14 14 0.03470861 0.08427002 0.02685605 0.0008561926 0.022475789 0.0003582521
## 15 15 0.03472035 0.08370827 0.02688634 0.0008174997 0.022799803 0.0003234209
## 16 16 0.03473846 0.08240525 0.02693034 0.0008690334 0.023843643 0.0004312217
## 17 17 0.03472856 0.08272831 0.02693944 0.0008762592 0.022490018 0.0004255057
## 18 18 0.03470802 0.08365234 0.02687973 0.0008204037 0.023552122 0.0004172611
## 19 19 0.03471436 0.08330441 0.02689963 0.0007927295 0.019590131 0.0003959860
## 20 20 0.03464868 0.08701104 0.02683826 0.0008400387 0.023194229 0.0003896163
## 21 21 0.03463457 0.08762411 0.02685159 0.0008402589 0.022592445 0.0004321247
## [1] "Best Model"
## nvmax
## 21 21
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Coefficients of final model:"
## Estimate 2.5 % 97.5 %
## (Intercept) 2.096700e+00 2.095793e+00 2.097608e+00
## PC1 2.873471e-04 2.265370e-04 3.481572e-04
## PC2 -3.749128e-04 -4.466321e-04 -3.031935e-04
## PC3 -8.988881e-05 -1.757190e-04 -4.058607e-06
## PC4 -9.456845e-05 -1.829814e-04 -6.155477e-06
## PC5 -2.370038e-04 -3.257068e-04 -1.483009e-04
## PC6 9.804339e-05 7.424745e-06 1.886620e-04
## PC7 -1.746993e-04 -2.666232e-04 -8.277543e-05
## PC8 -1.275751e-04 -2.219056e-04 -3.324452e-05
## PC9 8.338440e-05 -1.453646e-05 1.813053e-04
## PC10 -2.575589e-05 -1.266890e-04 7.517724e-05
## PC11 4.798348e-04 3.752480e-04 5.844216e-04
## PC12 -3.711748e-04 -4.763764e-04 -2.659731e-04
## PC13 2.934464e-04 1.849049e-04 4.019878e-04
## PC14 5.422023e-04 4.323119e-04 6.520927e-04
## PC15 -1.240986e-04 -2.372591e-04 -1.093813e-05
## PC16 2.136898e-04 9.582792e-05 3.315516e-04
## PC17 -9.631725e-05 -2.188520e-04 2.621754e-05
## PC18 -8.540462e-05 -2.123825e-04 4.157327e-05
## PC19 -8.935724e-05 -2.220067e-04 4.329220e-05
## PC20 4.024553e-04 2.603635e-04 5.445472e-04
## PC21 -3.428740e-04 -4.912106e-04 -1.945375e-04
# Score the stepwise-selection model against the test split.
if (algo.stepwise.caret == TRUE){
  test.model(model = model.stepwise, test = data.test,
             method = 'leapSeq',
             subopt = NULL,
             formula = formula,
             feature.names = feature.names,
             label.names = label.names,
             id = id,
             draw.limits = TRUE,
             transformation = t)
}
## [1] "Summary of predicted values: "
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.066 2.089 2.097 2.097 2.105 2.137
## [1] "leapSeq Test MSE: 0.00114591004507159"
# Fit a LASSO model (glmnet with alpha fixed at 1) via caret, tuning lambda
# over a log-spaced grid from 1e-4 to 1e-2 (100 values).
if (algo.LASSO.caret == TRUE){
  set.seed(1)  # reproducible CV folds
  lambda.grid = 10^seq(from = -4, to = -2, length = 100)
  tune.grid = expand.grid(alpha = 1, lambda = lambda.grid)
  returned = train.caret.glmselect(
    formula = formula,
    data = data.train,
    method = "glmnet",
    subopt = 'LASSO',
    tune.grid = tune.grid,
    feature.names = feature.names
  )
  model.LASSO.caret = returned$model
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled
## performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 1e-04 on full training set
## glmnet
##
## 5584 samples
## 21 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5026, 5026, 5026, 5025, 5025, 5026, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0.0001000000 0.03463703 0.087359043 0.02685258
## 0.0001047616 0.03463727 0.087345170 0.02685271
## 0.0001097499 0.03463753 0.087330343 0.02685285
## 0.0001149757 0.03463783 0.087314464 0.02685300
## 0.0001204504 0.03463815 0.087297441 0.02685316
## 0.0001261857 0.03463851 0.087279191 0.02685333
## 0.0001321941 0.03463890 0.087259610 0.02685353
## 0.0001384886 0.03463934 0.087238584 0.02685379
## 0.0001450829 0.03463983 0.087215987 0.02685409
## 0.0001519911 0.03464037 0.087191677 0.02685443
## 0.0001592283 0.03464096 0.087165484 0.02685482
## 0.0001668101 0.03464161 0.087137412 0.02685526
## 0.0001747528 0.03464233 0.087107650 0.02685574
## 0.0001830738 0.03464311 0.087075625 0.02685631
## 0.0001917910 0.03464397 0.087041382 0.02685693
## 0.0002009233 0.03464491 0.087005020 0.02685764
## 0.0002104904 0.03464592 0.086966625 0.02685845
## 0.0002205131 0.03464704 0.086925419 0.02685933
## 0.0002310130 0.03464827 0.086881010 0.02686032
## 0.0002420128 0.03464962 0.086832898 0.02686143
## 0.0002535364 0.03465111 0.086780550 0.02686265
## 0.0002656088 0.03465276 0.086723529 0.02686401
## 0.0002782559 0.03465458 0.086661324 0.02686547
## 0.0002915053 0.03465659 0.086593109 0.02686700
## 0.0003053856 0.03465881 0.086518549 0.02686868
## 0.0003199267 0.03466126 0.086436521 0.02687052
## 0.0003351603 0.03466396 0.086346730 0.02687248
## 0.0003511192 0.03466690 0.086250381 0.02687455
## 0.0003678380 0.03467010 0.086146135 0.02687686
## 0.0003853529 0.03467365 0.086030837 0.02687960
## 0.0004037017 0.03467746 0.085908414 0.02688256
## 0.0004229243 0.03468159 0.085777982 0.02688584
## 0.0004430621 0.03468579 0.085652735 0.02688935
## 0.0004641589 0.03469044 0.085514014 0.02689323
## 0.0004862602 0.03469554 0.085361273 0.02689747
## 0.0005094138 0.03470103 0.085199251 0.02690193
## 0.0005336699 0.03470707 0.085021311 0.02690682
## 0.0005590810 0.03471359 0.084832044 0.02691202
## 0.0005857021 0.03472059 0.084631700 0.02691764
## 0.0006135907 0.03472796 0.084430949 0.02692355
## 0.0006428073 0.03473586 0.084218832 0.02692982
## 0.0006734151 0.03474413 0.084007860 0.02693636
## 0.0007054802 0.03475302 0.083785874 0.02694324
## 0.0007390722 0.03476264 0.083546164 0.02695059
## 0.0007742637 0.03477238 0.083323501 0.02695828
## 0.0008111308 0.03478304 0.083076117 0.02696646
## 0.0008497534 0.03479387 0.082854257 0.02697481
## 0.0008902151 0.03480536 0.082628277 0.02698365
## 0.0009326033 0.03481770 0.082389181 0.02699272
## 0.0009770100 0.03483084 0.082143453 0.02700211
## 0.0010235310 0.03484493 0.081881915 0.02701177
## 0.0010722672 0.03486042 0.081583330 0.02702297
## 0.0011233240 0.03487748 0.081232460 0.02703533
## 0.0011768120 0.03489639 0.080814484 0.02704901
## 0.0012328467 0.03491734 0.080318146 0.02706411
## 0.0012915497 0.03494034 0.079741859 0.02708030
## 0.0013530478 0.03496569 0.079062976 0.02709821
## 0.0014174742 0.03499324 0.078293389 0.02711779
## 0.0014849683 0.03502329 0.077407044 0.02713937
## 0.0015556761 0.03505481 0.076500472 0.02716275
## 0.0016297508 0.03508809 0.075524132 0.02718736
## 0.0017073526 0.03512359 0.074446770 0.02721313
## 0.0017886495 0.03516127 0.073266874 0.02724030
## 0.0018738174 0.03520064 0.072017434 0.02726842
## 0.0019630407 0.03524334 0.070537574 0.02729812
## 0.0020565123 0.03528900 0.068843755 0.02732962
## 0.0021544347 0.03533469 0.067198747 0.02736148
## 0.0022570197 0.03538118 0.065549304 0.02739513
## 0.0023644894 0.03542700 0.064067357 0.02742798
## 0.0024770764 0.03547175 0.062830651 0.02746012
## 0.0025950242 0.03551618 0.061776548 0.02749222
## 0.0027185882 0.03556270 0.060651592 0.02752526
## 0.0028480359 0.03561289 0.059262400 0.02756095
## 0.0029836472 0.03566668 0.057512941 0.02759911
## 0.0031257158 0.03572423 0.055315775 0.02764041
## 0.0032745492 0.03578244 0.053066963 0.02768258
## 0.0034304693 0.03584236 0.050587558 0.02772655
## 0.0035938137 0.03590562 0.047509634 0.02777273
## 0.0037649358 0.03597249 0.043353183 0.02782201
## 0.0039442061 0.03603933 0.038566170 0.02787210
## 0.0041320124 0.03610273 0.033041610 0.02791944
## 0.0043287613 0.03615541 0.027681475 0.02796030
## 0.0045348785 0.03619548 0.022063016 0.02799110
## 0.0047508102 0.03621909 0.011479759 0.02800872
## 0.0049770236 0.03622570 0.005554303 0.02801454
## 0.0052140083 0.03622572 NaN 0.02801456
## 0.0054622772 0.03622572 NaN 0.02801456
## 0.0057223677 0.03622572 NaN 0.02801456
## 0.0059948425 0.03622572 NaN 0.02801456
## 0.0062802914 0.03622572 NaN 0.02801456
## 0.0065793322 0.03622572 NaN 0.02801456
## 0.0068926121 0.03622572 NaN 0.02801456
## 0.0072208090 0.03622572 NaN 0.02801456
## 0.0075646333 0.03622572 NaN 0.02801456
## 0.0079248290 0.03622572 NaN 0.02801456
## 0.0083021757 0.03622572 NaN 0.02801456
## 0.0086974900 0.03622572 NaN 0.02801456
## 0.0091116276 0.03622572 NaN 0.02801456
## 0.0095454846 0.03622572 NaN 0.02801456
## 0.0100000000 0.03622572 NaN 0.02801456
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 1e-04.
## alpha lambda
## 1 1 1e-04
## alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 1 0.0001000000 0.03463703 0.087359043 0.02685258 0.0008420596 0.022619383 0.0004244388
## 2 1 0.0001047616 0.03463727 0.087345170 0.02685271 0.0008421311 0.022622631 0.0004241070
## 3 1 0.0001097499 0.03463753 0.087330343 0.02685285 0.0008422066 0.022626026 0.0004237570
## 4 1 0.0001149757 0.03463783 0.087314464 0.02685300 0.0008422862 0.022629573 0.0004233923
## 5 1 0.0001204504 0.03463815 0.087297441 0.02685316 0.0008423703 0.022633286 0.0004230122
## 6 1 0.0001261857 0.03463851 0.087279191 0.02685333 0.0008424591 0.022637170 0.0004226161
## 7 1 0.0001321941 0.03463890 0.087259610 0.02685353 0.0008425530 0.022641232 0.0004222176
## 8 1 0.0001384886 0.03463934 0.087238584 0.02685379 0.0008426522 0.022645480 0.0004217868
## 9 1 0.0001450829 0.03463983 0.087215987 0.02685409 0.0008427572 0.022649921 0.0004213382
## 10 1 0.0001519911 0.03464037 0.087191677 0.02685443 0.0008428681 0.022654562 0.0004208784
## 11 1 0.0001592283 0.03464096 0.087165484 0.02685482 0.0008429855 0.022659409 0.0004203654
## 12 1 0.0001668101 0.03464161 0.087137412 0.02685526 0.0008431104 0.022664506 0.0004198135
## 13 1 0.0001747528 0.03464233 0.087107650 0.02685574 0.0008432455 0.022669940 0.0004192242
## 14 1 0.0001830738 0.03464311 0.087075625 0.02685631 0.0008433891 0.022675632 0.0004186347
## 15 1 0.0001917910 0.03464397 0.087041382 0.02685693 0.0008435437 0.022681618 0.0004180517
## 16 1 0.0002009233 0.03464491 0.087005020 0.02685764 0.0008436984 0.022687588 0.0004175059
## 17 1 0.0002104904 0.03464592 0.086966625 0.02685845 0.0008438638 0.022694105 0.0004169886
## 18 1 0.0002205131 0.03464704 0.086925419 0.02685933 0.0008440372 0.022700557 0.0004164807
## 19 1 0.0002310130 0.03464827 0.086881010 0.02686032 0.0008442181 0.022707062 0.0004159053
## 20 1 0.0002420128 0.03464962 0.086832898 0.02686143 0.0008444104 0.022713743 0.0004153125
## 21 1 0.0002535364 0.03465111 0.086780550 0.02686265 0.0008446147 0.022720685 0.0004147131
## 22 1 0.0002656088 0.03465276 0.086723529 0.02686401 0.0008448317 0.022727890 0.0004140713
## 23 1 0.0002782559 0.03465458 0.086661324 0.02686547 0.0008450621 0.022735333 0.0004133708
## 24 1 0.0002915053 0.03465659 0.086593109 0.02686700 0.0008453035 0.022742672 0.0004126494
## 25 1 0.0003053856 0.03465881 0.086518549 0.02686868 0.0008455602 0.022750246 0.0004118587
## 26 1 0.0003199267 0.03466126 0.086436521 0.02687052 0.0008458283 0.022757527 0.0004110060
## 27 1 0.0003351603 0.03466396 0.086346730 0.02687248 0.0008461146 0.022765083 0.0004101288
## 28 1 0.0003511192 0.03466690 0.086250381 0.02687455 0.0008462949 0.022770778 0.0004092210
## 29 1 0.0003678380 0.03467010 0.086146135 0.02687686 0.0008464147 0.022774971 0.0004083365
## 30 1 0.0003853529 0.03467365 0.086030837 0.02687960 0.0008464088 0.022779949 0.0004075101
## 31 1 0.0004037017 0.03467746 0.085908414 0.02688256 0.0008463364 0.022780696 0.0004066698
## 32 1 0.0004229243 0.03468159 0.085777982 0.02688584 0.0008462775 0.022780629 0.0004057825
## 33 1 0.0004430621 0.03468579 0.085652735 0.02688935 0.0008462494 0.022776940 0.0004048295
## 34 1 0.0004641589 0.03469044 0.085514014 0.02689323 0.0008462268 0.022771759 0.0004039119
## 35 1 0.0004862602 0.03469554 0.085361273 0.02689747 0.0008462182 0.022765303 0.0004029915
## 36 1 0.0005094138 0.03470103 0.085199251 0.02690193 0.0008463440 0.022758071 0.0004020680
## 37 1 0.0005336699 0.03470707 0.085021311 0.02690682 0.0008465937 0.022757325 0.0004012301
## 38 1 0.0005590810 0.03471359 0.084832044 0.02691202 0.0008470244 0.022758301 0.0004006059
## 39 1 0.0005857021 0.03472059 0.084631700 0.02691764 0.0008476026 0.022762950 0.0004002461
## 40 1 0.0006135907 0.03472796 0.084430949 0.02692355 0.0008485035 0.022780759 0.0003999278
## 41 1 0.0006428073 0.03473586 0.084218832 0.02692982 0.0008494313 0.022801554 0.0003995926
## 42 1 0.0006734151 0.03474413 0.084007860 0.02693636 0.0008505118 0.022840498 0.0003992681
## 43 1 0.0007054802 0.03475302 0.083785874 0.02694324 0.0008517523 0.022895115 0.0003990897
## 44 1 0.0007390722 0.03476264 0.083546164 0.02695059 0.0008530197 0.022964488 0.0003988119
## 45 1 0.0007742637 0.03477238 0.083323501 0.02695828 0.0008542425 0.023003440 0.0003983118
## 46 1 0.0008111308 0.03478304 0.083076117 0.02696646 0.0008555720 0.023054711 0.0003979884
## 47 1 0.0008497534 0.03479387 0.082854257 0.02697481 0.0008569407 0.023104132 0.0003977733
## 48 1 0.0008902151 0.03480536 0.082628277 0.02698365 0.0008582919 0.023166446 0.0003980287
## 49 1 0.0009326033 0.03481770 0.082389181 0.02699272 0.0008596553 0.023238239 0.0003985172
## 50 1 0.0009770100 0.03483084 0.082143453 0.02700211 0.0008610950 0.023315357 0.0003990204
## 51 1 0.0010235310 0.03484493 0.081881915 0.02701177 0.0008625642 0.023379726 0.0003999570
## 52 1 0.0010722672 0.03486042 0.081583330 0.02702297 0.0008638880 0.023469548 0.0004001190
## 53 1 0.0011233240 0.03487748 0.081232460 0.02703533 0.0008652653 0.023555880 0.0004001307
## 54 1 0.0011768120 0.03489639 0.080814484 0.02704901 0.0008666338 0.023652274 0.0003995515
## 55 1 0.0012328467 0.03491734 0.080318146 0.02706411 0.0008682165 0.023763816 0.0003989079
## 56 1 0.0012915497 0.03494034 0.079741859 0.02708030 0.0008693434 0.023864159 0.0003981435
## 57 1 0.0013530478 0.03496569 0.079062976 0.02709821 0.0008701321 0.023966965 0.0003977875
## 58 1 0.0014174742 0.03499324 0.078293389 0.02711779 0.0008705598 0.024083062 0.0003974366
## 59 1 0.0014849683 0.03502329 0.077407044 0.02713937 0.0008705724 0.024245220 0.0003972179
## 60 1 0.0015556761 0.03505481 0.076500472 0.02716275 0.0008705427 0.024551475 0.0003975565
## 61 1 0.0016297508 0.03508809 0.075524132 0.02718736 0.0008705758 0.024840409 0.0003980537
## 62 1 0.0017073526 0.03512359 0.074446770 0.02721313 0.0008705662 0.025108521 0.0003982154
## 63 1 0.0017886495 0.03516127 0.073266874 0.02724030 0.0008706354 0.025357414 0.0003986238
## 64 1 0.0018738174 0.03520064 0.072017434 0.02726842 0.0008728679 0.025505591 0.0004017743
## 65 1 0.0019630407 0.03524334 0.070537574 0.02729812 0.0008753825 0.025638236 0.0004055445
## 66 1 0.0020565123 0.03528900 0.068843755 0.02732962 0.0008776996 0.025723867 0.0004102578
## 67 1 0.0021544347 0.03533469 0.067198747 0.02736148 0.0008805808 0.025683090 0.0004161168
## 68 1 0.0022570197 0.03538118 0.065549304 0.02739513 0.0008846185 0.025638557 0.0004232165
## 69 1 0.0023644894 0.03542700 0.064067357 0.02742798 0.0008899850 0.025491109 0.0004314162
## 70 1 0.0024770764 0.03547175 0.062830651 0.02746012 0.0008949340 0.025352735 0.0004391349
## 71 1 0.0025950242 0.03551618 0.061776548 0.02749222 0.0008994959 0.024973736 0.0004458268
## 72 1 0.0027185882 0.03556270 0.060651592 0.02752526 0.0009037561 0.024526613 0.0004521449
## 73 1 0.0028480359 0.03561289 0.059262400 0.02756095 0.0009080658 0.023863188 0.0004580850
## 74 1 0.0029836472 0.03566668 0.057512941 0.02759911 0.0009133997 0.022861027 0.0004650746
## 75 1 0.0031257158 0.03572423 0.055315775 0.02764041 0.0009192965 0.021538093 0.0004717841
## 76 1 0.0032745492 0.03578244 0.053066963 0.02768258 0.0009277199 0.019758829 0.0004795661
## 77 1 0.0034304693 0.03584236 0.050587558 0.02772655 0.0009363006 0.017421226 0.0004866999
## 78 1 0.0035938137 0.03590562 0.047509634 0.02777273 0.0009433884 0.014994242 0.0004924287
## 79 1 0.0037649358 0.03597249 0.043353183 0.02782201 0.0009484030 0.012746132 0.0004970765
## 80 1 0.0039442061 0.03603933 0.038566170 0.02787210 0.0009517570 0.012024211 0.0004993445
## 81 1 0.0041320124 0.03610273 0.033041610 0.02791944 0.0009548843 0.012366545 0.0005009790
## 82 1 0.0043287613 0.03615541 0.027681475 0.02796030 0.0009600741 0.013024187 0.0005036471
## 83 1 0.0045348785 0.03619548 0.022063016 0.02799110 0.0009632621 0.014021386 0.0005044522
## 84 1 0.0047508102 0.03621909 0.011479759 0.02800872 0.0009588895 0.005696682 0.0005001673
## 85 1 0.0049770236 0.03622570 0.005554303 0.02801454 0.0009538584 NA 0.0004960874
## 86 1 0.0052140083 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 87 1 0.0054622772 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 88 1 0.0057223677 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 89 1 0.0059948425 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 90 1 0.0062802914 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 91 1 0.0065793322 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 92 1 0.0068926121 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 93 1 0.0072208090 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 94 1 0.0075646333 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 95 1 0.0079248290 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 96 1 0.0083021757 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 97 1 0.0086974900 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 98 1 0.0091116276 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 99 1 0.0095454846 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## 100 1 0.0100000000 0.03622572 NaN 0.02801456 0.0009538310 NA 0.0004960658
## Warning: Removed 15 rows containing missing values (geom_path).
## Warning: Removed 15 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Coefficients"
## model.coef
## (Intercept) 2.096695e+00
## PC1 2.805726e-04
## PC2 -3.669792e-04
## PC3 -8.033306e-05
## PC4 -8.513457e-05
## PC5 -2.271190e-04
## PC6 8.832998e-05
## PC7 -1.645120e-04
## PC8 -1.172464e-04
## PC9 7.331013e-05
## PC10 -1.460581e-05
## PC11 4.676647e-04
## PC12 -3.597023e-04
## PC13 2.807489e-04
## PC14 5.300203e-04
## PC15 -1.115144e-04
## PC16 2.002768e-04
## PC17 -8.295789e-05
## PC18 -7.219463e-05
## PC19 -7.490273e-05
## PC20 3.861752e-04
## PC21 -3.263964e-04
# Score the LASSO (glmnet) model against the test split.
# No `id` is passed here, matching the LASSO training call above which did
# not capture one.
if (algo.LASSO.caret == TRUE){
  test.model(model = model.LASSO.caret, test = data.test,
             method = 'glmnet',
             subopt = "LASSO",
             formula = formula,
             feature.names = feature.names,
             label.names = label.names,
             draw.limits = TRUE,
             transformation = t)
}
## [1] "Summary of predicted values: "
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.067 2.089 2.097 2.097 2.105 2.136
## [1] "glmnet LASSO Test MSE: 0.00114545205184667"
# Fit a Least Angle Regression (lars) model via caret on the training split.
# BUG FIX: subopt was the *string* 'NULL' rather than the NULL object. Every
# other call in this file passes either a real NULL (see the lars
# test.model() call below) or a genuine sub-option name such as 'LASSO', so
# the quoted 'NULL' here was almost certainly a typo.
if (algo.LARS.caret == TRUE){
  set.seed(1)  # reproducible CV folds
  returned = train.caret.glmselect(formula = formula
                           ,data = data.train
                           ,method = "lars"
                           ,subopt = NULL
                           ,feature.names = feature.names)
  model.LARS.caret = returned$model
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing values in resampled
## performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting fraction = 0.99 on full training set
## Least Angle Regression
##
## 5584 samples
## 21 predictor
##
## Pre-processing: centered (21), scaled (21)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 5026, 5026, 5026, 5025, 5025, 5026, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.00000000 0.03622572 NaN 0.02801456
## 0.01010101 0.03617935 0.02214736 0.02797972
## 0.02020202 0.03613459 0.02806347 0.02794487
## 0.03030303 0.03609109 0.03378642 0.02791147
## 0.04040404 0.03604783 0.03765234 0.02787909
## 0.05050505 0.03600372 0.04136539 0.02784667
## 0.06060606 0.03595923 0.04516500 0.02781355
## 0.07070707 0.03591594 0.04793537 0.02778172
## 0.08080808 0.03587384 0.04999887 0.02775132
## 0.09090909 0.03583357 0.05167235 0.02772205
## 0.10101010 0.03579551 0.05318279 0.02769380
## 0.11111111 0.03575872 0.05444235 0.02766651
## 0.12121212 0.03572269 0.05568819 0.02764016
## 0.13131313 0.03568714 0.05701536 0.02761456
## 0.14141414 0.03565247 0.05817530 0.02759017
## 0.15151515 0.03561798 0.05926023 0.02756562
## 0.16161616 0.03558472 0.06019836 0.02754181
## 0.17171717 0.03555363 0.06099958 0.02751958
## 0.18181818 0.03552456 0.06167948 0.02749902
## 0.19191919 0.03549660 0.06224718 0.02747897
## 0.20202020 0.03546953 0.06279825 0.02745931
## 0.21212121 0.03544333 0.06346858 0.02744014
## 0.22222222 0.03541793 0.06425287 0.02742152
## 0.23232323 0.03539307 0.06507763 0.02740381
## 0.24242424 0.03536832 0.06593395 0.02738616
## 0.25252525 0.03534367 0.06686548 0.02736882
## 0.26262626 0.03531920 0.06781345 0.02735176
## 0.27272727 0.03529525 0.06870694 0.02733499
## 0.28282828 0.03527158 0.06956881 0.02731842
## 0.29292929 0.03524838 0.07040542 0.02730225
## 0.30303030 0.03522570 0.07118960 0.02728642
## 0.31313131 0.03520363 0.07193930 0.02727100
## 0.32323232 0.03518210 0.07263681 0.02725587
## 0.33333333 0.03516125 0.07327738 0.02724120
## 0.34343434 0.03514068 0.07390801 0.02722641
## 0.35353535 0.03512055 0.07454128 0.02721195
## 0.36363636 0.03510115 0.07513484 0.02719784
## 0.37373737 0.03508237 0.07568701 0.02718412
## 0.38383838 0.03506398 0.07622739 0.02717044
## 0.39393939 0.03504597 0.07674819 0.02715693
## 0.40404040 0.03502842 0.07724006 0.02714397
## 0.41414141 0.03501099 0.07777576 0.02713160
## 0.42424242 0.03499428 0.07827368 0.02711956
## 0.43434343 0.03497814 0.07873312 0.02710776
## 0.44444444 0.03496259 0.07915487 0.02709655
## 0.45454545 0.03494749 0.07954785 0.02708571
## 0.46464646 0.03493267 0.07994837 0.02707535
## 0.47474747 0.03491852 0.08031120 0.02706542
## 0.48484848 0.03490514 0.08063665 0.02705585
## 0.49494949 0.03489275 0.08092279 0.02704665
## 0.50505051 0.03488085 0.08119212 0.02703785
## 0.51515152 0.03486948 0.08143672 0.02702940
## 0.52525253 0.03485898 0.08164734 0.02702176
## 0.53535354 0.03484929 0.08183418 0.02701486
## 0.54545455 0.03484037 0.08198670 0.02700860
## 0.55555556 0.03483182 0.08213194 0.02700270
## 0.56565657 0.03482359 0.08226644 0.02699707
## 0.57575758 0.03481537 0.08241458 0.02699125
## 0.58585859 0.03480740 0.08256137 0.02698551
## 0.59595960 0.03479954 0.08272077 0.02697964
## 0.60606061 0.03479210 0.08286400 0.02697404
## 0.61616162 0.03478478 0.08301543 0.02696838
## 0.62626263 0.03477743 0.08318768 0.02696266
## 0.63636364 0.03477037 0.08335075 0.02695716
## 0.64646465 0.03476323 0.08352538 0.02695160
## 0.65656566 0.03475635 0.08369005 0.02694618
## 0.66666667 0.03474970 0.08385280 0.02694112
## 0.67676768 0.03474331 0.08400884 0.02693615
## 0.68686869 0.03473694 0.08417655 0.02693117
## 0.69696970 0.03473076 0.08434183 0.02692633
## 0.70707071 0.03472476 0.08450518 0.02692155
## 0.71717172 0.03471879 0.08467679 0.02691692
## 0.72727273 0.03471299 0.08484501 0.02691223
## 0.73737374 0.03470737 0.08500904 0.02690763
## 0.74747475 0.03470204 0.08516457 0.02690333
## 0.75757576 0.03469689 0.08531866 0.02689913
## 0.76767677 0.03469187 0.08546999 0.02689489
## 0.77777778 0.03468712 0.08561222 0.02689084
## 0.78787879 0.03468263 0.08574569 0.02688706
## 0.79797980 0.03467816 0.08588459 0.02688342
## 0.80808081 0.03467371 0.08602845 0.02687992
## 0.81818182 0.03466951 0.08616420 0.02687656
## 0.82828283 0.03466558 0.08629123 0.02687362
## 0.83838384 0.03466178 0.08641653 0.02687090
## 0.84848485 0.03465824 0.08653435 0.02686826
## 0.85858586 0.03465500 0.08664271 0.02686577
## 0.86868687 0.03465203 0.08674378 0.02686339
## 0.87878788 0.03464932 0.08683819 0.02686116
## 0.88888889 0.03464686 0.08692620 0.02685913
## 0.89898990 0.03464464 0.08700888 0.02685736
## 0.90909091 0.03464260 0.08708900 0.02685585
## 0.91919192 0.03464078 0.08716488 0.02685461
## 0.92929293 0.03463921 0.08723554 0.02685362
## 0.93939394 0.03463788 0.08730074 0.02685292
## 0.94949495 0.03463680 0.08736066 0.02685239
## 0.95959596 0.03463596 0.08741665 0.02685192
## 0.96969697 0.03463530 0.08747280 0.02685159
## 0.97979798 0.03463481 0.08752788 0.02685147
## 0.98989899 0.03463457 0.08757827 0.02685145
## 1.00000000 0.03463457 0.08762411 0.02685159
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.989899.
## fraction
## 99 0.989899
## Warning: Removed 1 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## [1] "Coefficients"
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 0.0042670086 -0.0047226721 -0.0009297697 -0.0009512082 -0.0024036919 0.0009623593 -0.0017042610 -0.0012071027
## PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16
## 0.0007540078 -0.0002113743 0.0041423251 -0.0031810173 0.0024316543 0.0044554051 -0.0009745449 0.0016248099
## PC17 PC18 PC19 PC20 PC21
## -0.0006933490 -0.0005913104 -0.0005912956 0.0025484572 -0.0020774230
# Score the Least Angle Regression (lars) model against the test split.
if (algo.LARS.caret == TRUE){
  test.model(model = model.LARS.caret, test = data.test,
             method = 'lars',
             subopt = NULL,
             formula = formula,
             feature.names = feature.names,
             label.names = label.names,
             draw.limits = TRUE,
             transformation = t)
}
## [1] "Summary of predicted values: "
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.066 2.089 2.097 2.097 2.105 2.137
## [1] "lars Test MSE: 0.00114578344250532"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 knitr_1.20 htmltools_0.3.6 reshape2_1.4.3
## [5] lars_1.2 doParallel_1.0.14 iterators_1.0.10 caret_6.0-81
## [9] leaps_3.0 ggforce_0.1.3 rlist_0.4.6.1 car_3.0-2
## [13] carData_3.0-2 bestNormalize_1.3.0 scales_1.0.0 onewaytests_2.0
## [17] caTools_1.17.1.1 mosaic_1.5.0 mosaicData_0.17.0 ggformula_0.9.1
## [21] ggstance_0.3.1 lattice_0.20-35 DT_0.5 ggiraph_0.6.0
## [25] investr_1.4.0 glmnet_2.0-16 foreach_1.4.4 Matrix_1.2-14
## [29] MASS_7.3-50 PerformanceAnalytics_1.5.2 xts_0.11-2 zoo_1.8-4
## [33] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8 purrr_0.2.5
## [37] readr_1.3.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0
## [41] tidyverse_1.2.1 usdm_1.1-18 raster_2.8-4 sp_1.3-1
## [45] pacman_0.5.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.2.0 backports_1.1.3 plyr_1.8.4 lazyeval_0.2.1 splines_3.5.1 mycor_0.1.1
## [7] crosstalk_1.0.0 leaflet_2.0.2 digest_0.6.18 magrittr_1.5 mosaicCore_0.6.0 openxlsx_4.1.0
## [13] recipes_0.1.4 modelr_0.1.2 gower_0.1.2 colorspace_1.3-2 rvest_0.3.2 ggrepel_0.8.0
## [19] haven_2.0.0 crayon_1.3.4 jsonlite_1.5 bindr_0.1.1 survival_2.42-3 glue_1.3.0
## [25] registry_0.5 gtable_0.2.0 ppcor_1.1 ipred_0.9-8 abind_1.4-5 rngtools_1.3.1
## [31] bibtex_0.4.2 Rcpp_1.0.0 xtable_1.8-3 units_0.6-2 foreign_0.8-70 stats4_3.5.1
## [37] lava_1.6.4 prodlim_2018.04.18 htmlwidgets_1.3 httr_1.4.0 RColorBrewer_1.1-2 pkgconfig_2.0.2
## [43] farver_1.1.0 nnet_7.3-12 labeling_0.3 tidyselect_0.2.5 rlang_0.3.1 later_0.7.5
## [49] munsell_0.5.0 cellranger_1.1.0 tools_3.5.1 cli_1.0.1 generics_0.0.2 moments_0.14
## [55] sjlabelled_1.0.17 broom_0.5.1 evaluate_0.12 ggdendro_0.1-20 yaml_2.2.0 ModelMetrics_1.2.2
## [61] zip_2.0.1 nlme_3.1-137 doRNG_1.7.1 mime_0.6 xml2_1.2.0 compiler_3.5.1
## [67] rstudioapi_0.8 curl_3.2 tweenr_1.0.1 stringi_1.2.4 gdtools_0.1.7 pillar_1.3.1
## [73] data.table_1.11.8 bitops_1.0-6 insight_0.1.2 httpuv_1.4.5 R6_2.3.0 promises_1.0.1
## [79] gridExtra_2.3 rio_0.5.16 codetools_0.2-15 assertthat_0.2.0 pkgmaker_0.27 withr_2.1.2
## [85] nortest_1.0-4 mgcv_1.8-24 hms_0.4.2 quadprog_1.5-5 grid_3.5.1 rpart_4.1-13
## [91] timeDate_3043.102 class_7.3-14 rmarkdown_1.11 shiny_1.2.0 lubridate_1.7.4